1. Cleaning variables

library(lubridate)
## Warning: 程辑包'lubridate'是用R版本4.2.3 来建造的
## 
## 载入程辑包:'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
d = read.csv("full.csv")
d$day = day(d$ED)

d = d[, !names(d) %in% c("total_load", "pv_est")]
d = d[, !names(d) %in% c("dt", "ACDT", "ED", "DST", "yfrac")]

d = na.omit(d)
head(d)
##    temp cloud wind wdirection humidity rain radiation demand isDST year month
## 1 20.75   2.5 14.5        135     44.5    0      1915   1288  TRUE 2018     3
## 2 21.50   1.0 16.0        140     40.0    0      2340   1237  TRUE 2018     3
## 3 22.25   1.5 15.5        145     37.0    0      2570   1189  TRUE 2018     3
## 4 23.00   2.0 15.0        150     34.0    0      2800   1150  TRUE 2018     3
## 5 23.55   2.0 13.0        145     32.0    0      2945   1122  TRUE 2018     3
## 6 24.10   2.0 11.0        140     30.0    0      3090   1102  TRUE 2018     3
##   hour minutes yday dw day
## 1    8      30   65  2   6
## 2    9       0   65  2   6
## 3    9      30   65  2   6
## 4   10       0   65  2   6
## 5   10      30   65  2   6
## 6   11       0   65  2   6

2. Feature selection

cor_matrix <- cor(d, method = "pearson")
cor_matrix <- round(cor_matrix, 2)
as.data.frame(cor_matrix)
##             temp cloud  wind wdirection humidity  rain radiation demand isDST
## temp        1.00 -0.11  0.25       0.15    -0.73 -0.11      0.52  -0.15  0.53
## cloud      -0.11  1.00  0.19       0.12     0.20  0.26     -0.18   0.14 -0.17
## wind        0.25  0.19  1.00       0.35    -0.29  0.18      0.31  -0.14  0.07
## wdirection  0.15  0.12  0.35       1.00     0.03  0.12      0.30  -0.17  0.07
## humidity   -0.73  0.20 -0.29       0.03     1.00  0.23     -0.46   0.17 -0.32
## rain       -0.11  0.26  0.18       0.12     0.23  1.00     -0.14   0.12 -0.09
## radiation   0.52 -0.18  0.31       0.30    -0.46 -0.14      1.00  -0.56  0.27
## demand     -0.15  0.14 -0.14      -0.17     0.17  0.12     -0.56   1.00 -0.25
## isDST       0.53 -0.17  0.07       0.07    -0.32 -0.09      0.27  -0.25  1.00
## year        0.07 -0.02 -0.01      -0.01     0.08  0.02      0.03  -0.10  0.13
## month      -0.20  0.09  0.06       0.03     0.03  0.04      0.05  -0.09  0.01
## hour        0.12 -0.07  0.08       0.12    -0.10  0.00     -0.01   0.24  0.00
## minutes     0.00  0.00  0.00       0.00     0.00  0.00      0.00  -0.01  0.00
## yday       -0.20  0.08  0.06       0.03     0.03  0.04      0.05  -0.09  0.01
## dw          0.00  0.02 -0.01       0.03     0.04  0.03      0.00  -0.14  0.00
## day         0.01 -0.04 -0.04      -0.01     0.01 -0.01      0.01   0.02  0.00
##             year month  hour minutes  yday    dw   day
## temp        0.07 -0.20  0.12    0.00 -0.20  0.00  0.01
## cloud      -0.02  0.09 -0.07    0.00  0.08  0.02 -0.04
## wind       -0.01  0.06  0.08    0.00  0.06 -0.01 -0.04
## wdirection -0.01  0.03  0.12    0.00  0.03  0.03 -0.01
## humidity    0.08  0.03 -0.10    0.00  0.03  0.04  0.01
## rain        0.02  0.04  0.00    0.00  0.04  0.03 -0.01
## radiation   0.03  0.05 -0.01    0.00  0.05  0.00  0.01
## demand     -0.10 -0.09  0.24   -0.01 -0.09 -0.14  0.02
## isDST       0.13  0.01  0.00    0.00  0.01  0.00  0.00
## year        1.00 -0.17  0.00    0.00 -0.17  0.00 -0.02
## month      -0.17  1.00  0.00    0.00  1.00  0.00  0.01
## hour        0.00  0.00  1.00    0.00  0.00  0.00  0.00
## minutes     0.00  0.00  0.00    1.00  0.00  0.00  0.00
## yday       -0.17  1.00  0.00    0.00  1.00  0.00  0.09
## dw          0.00  0.00  0.00    0.00  0.00  1.00  0.00
## day        -0.02  0.01  0.00    0.00  0.09  0.00  1.00
library(corrplot)
## Warning: 程辑包'corrplot'是用R版本4.2.3 来建造的
## corrplot 0.92 loaded
corrplot(cor_matrix, type = "upper", order = "hclust", 
         tl.col = "black", tl.srt = 45)

3. Fitting Model

Fit all data before 2023 used for training

# X_train = d[d$year < 2023,][,names(d) != "demand"]
# y_train = d[d$year < 2023,][,names(d) == "demand"]
# X_test = d[d$year >= 2023,][,names(d) != "demand"]
# y_test = d[d$year >= 2023,][,names(d) == "demand"]

fitdata = d[d$year < 2023, ]
tstdata = d[d$year >= 2023, ]

library(randomForest)
## Warning: 程辑包'randomForest'是用R版本4.2.3 来建造的
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!DONT RUN AGAIN
rf = randomForest(demand~ ., fitdata)

4. By year

mergedata = tstdata
mergedata$preds = predict(rf, mergedata)


mse = function(pre, act){mean((pre - act)^2)}

R2 = function(pre, act) 1 - sum((pre - act)^2) / sum((act - mean(act))^2) 

mse_year = mse(mergedata$preds, mergedata$demand)
mse_year
## [1] 36284.71
R2_year = R2(mergedata$preds, mergedata$demand)
R2_year
## [1] 0.8090011

5. By day

mergedata$date <- as.Date(paste("2023", mergedata$month, mergedata$day, sep = "-"))
sum_demand2 <- aggregate((preds-demand)^2 ~ date, mergedata, sum)
colnames(sum_demand2)[2] <- "sum_top"

daily_mean <- aggregate(demand ~ date, mergedata, mean)
mergedata <- merge(mergedata, daily_mean, by = "date", suffixes = c("", "_mean"))

sum_demand3 = aggregate((demand -demand_mean)^2 ~ date, mergedata, sum)
colnames(sum_demand3)[2] <- "sum_bottom"

mix_data = merge(sum_demand2, sum_demand3, by = "date")
mix_data$R2 = 1-mix_data$sum_top/mix_data$sum_bottom
# aggregated_data <- aggregate(. ~ date, mergedata, sum)

5.1 R-square by day

library(plotly)
## Warning: 程辑包'plotly'是用R版本4.2.3 来建造的
## 载入需要的程辑包:ggplot2
## Warning: 程辑包'ggplot2'是用R版本4.2.3 来建造的
## 
## 载入程辑包:'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## 
## 载入程辑包:'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot <- plot_ly(mix_data, x = ~date, y = ~R2, type = "scatter", mode = "lines")
plot

5.2 Residuals by day

# aggregate predictions, actuals and residuals by day
sum_demand3 = mergedata %>% group_by(date) %>% 
  summarise(
    preds_sumbyday = sum(preds),
    demand_sumbyday = sum(demand),
    rsd_byday = sum(preds) - sum(demand)
    )

# sum_demand3
plot_ly(
        x = sum_demand3$date, y= sum_demand3$preds_sumbyday,
        type = "scatter",
        mode = "lines",
        name = "2023-Preds") %>% 
  add_lines(sum_demand3$date, sum_demand3$demand_sumbyday, name = "2023-Actual") %>% 
  layout(xaxis = list(title = "2023 Date"), yaxis = list(title = "Demand"))
plot_ly(sum_demand3, x = ~date, y = ~rsd_byday, type = "scatter", mode = "lines")%>%
  layout(xaxis = list(title = "2023 Date"), yaxis = list(title = "Residual by day"))

5.3 MSE by day

mergedata$mse_2 = (mergedata$preds - mergedata$demand)^2
mse_demand = mergedata %>% group_by(date) %>% 
  summarise(
    preds_sumbyday = sum(preds),
    demand_sumbyday = sum(demand),
    mse_byday = mean(mse_2)
    )
# mse_demand
plot_ly(mse_demand, x = ~date, y = ~mse_byday, type = "scatter", mode = "lines")%>%
  layout(xaxis = list(title = "2023 Date"), yaxis = list(title = "MSE by day"))

Discussions:

  • JAN 24th + Feb 24th, abnormally high demand? (24th 9:45 Adelaide soccer??)